// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef TESSERACT_OPENCL_OCLKERNELS_H_
#  define TESSERACT_OPENCL_OCLKERNELS_H_

#  ifndef USE_EXTERNAL_KERNEL
#    define KERNEL(...) #    __VA_ARGS__ "\n"
// Double precision is a default of spreadsheets
// cl_khr_fp64: Khronos extension
// cl_amd_fp64: AMD extension
// use build option outside to define fp_t
/////////////////////////////////////////////
static const char *kernel_src = KERNEL(
\n #ifdef KHR_DP_EXTENSION\n
\n #pragma OPENCL EXTENSION cl_khr_fp64
    : enable\n
\n #elif AMD_DP_EXTENSION\n
\n #pragma OPENCL EXTENSION cl_amd_fp64
    : enable\n
\n #else \n
\n #endif \n
        __kernel void composeRGBPixel(__global uint *tiffdata, int w, int h, int wpl, __global uint *output) {
            int i = get_global_id(1);
            int j = get_global_id(0);
            int tiffword, rval, gval, bval;

            //Ignore the excess
            if ((i >= h) || (j >= w))
                return;

            tiffword = tiffdata[i * w + j];
            rval = ((tiffword)&0xff);
            gval = (((tiffword) >> 8) & 0xff);
            bval = (((tiffword) >> 16) & 0xff);
            output[i * wpl + j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2)));
        })

    KERNEL(
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword, const int wpl, const int h) {
        const unsigned int row = get_global_id(1);
        const unsigned int col = get_global_id(0);
        const unsigned int pos = row * wpl + col;

        //Ignore the execss
        if (row >= h || col >= wpl)
            return;

        *(dword + pos) &= ~(*(sword + pos));
    }\n)

        KERNEL(
\n__kernel void morphoDilateHor_5x5(__global int *sword, __global int *dword, const int wpl, const int h) {
            const unsigned int pos = get_global_id(0);
            unsigned int prevword, nextword, currword, tempword;
            unsigned int destword;
            const int col = pos % wpl;

            //Ignore the execss
            if (pos >= (wpl * h))
                return;

            currword = *(sword + pos);
            destword = currword;

            //Handle boundary conditions
            if (col == 0)
                prevword = 0;
            else
                prevword = *(sword + pos - 1);

            if (col == (wpl - 1))
                nextword = 0;
            else
                nextword = *(sword + pos + 1);

            //Loop unrolled

            //1 bit to left and 1 bit to right
            //Get the max value on LHS of every pixel
            tempword = (prevword << (31)) | ((currword >> 1));
            destword |= tempword;
            //Get max value on RHS of every pixel
            tempword = (currword << 1) | (nextword >> (31));
            destword |= tempword;

            //2 bit to left and 2 bit to right
            //Get the max value on LHS of every pixel
            tempword = (prevword << (30)) | ((currword >> 2));
            destword |= tempword;
            //Get max value on RHS of every pixel
            tempword = (currword << 2) | (nextword >> (30));
            destword |= tempword;

            *(dword + pos) = destword;
        }\n)

            KERNEL(
\n__kernel void morphoDilateVer_5x5(__global int *sword, __global int *dword, const int wpl, const int h) {
                const int col = get_global_id(0);
                const int row = get_global_id(1);
                const unsigned int pos = row * wpl + col;
                unsigned int tempword;
                unsigned int destword;
                int i;

                //Ignore the execss
                if (row >= h || col >= wpl)
                    return;

                destword = *(sword + pos);

                //2 words above
                i = (row - 2) < 0 ? row : (row - 2);
                tempword = *(sword + i * wpl + col);
                destword |= tempword;

                //1 word above
                i = (row - 1) < 0 ? row : (row - 1);
                tempword = *(sword + i * wpl + col);
                destword |= tempword;

                //1 word below
                i = (row >= (h - 1)) ? row : (row + 1);
                tempword = *(sword + i * wpl + col);
                destword |= tempword;

                //2 words below
                i = (row >= (h - 2)) ? row : (row + 2);
                tempword = *(sword + i * wpl + col);
                destword |= tempword;

                *(dword + pos) = destword;
            }\n)

                KERNEL(
\n__kernel void morphoDilateHor(__global int *sword, __global int *dword, const int xp, const int xn, const int wpl, const int h) {
                    const int col = get_global_id(0);
                    const int row = get_global_id(1);
                    const unsigned int pos = row * wpl + col;
                    unsigned int parbitsxp, parbitsxn, nwords;
                    unsigned int destword, tempword, lastword, currword;
                    unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
                    int i, j, siter, eiter;

                    //Ignore the execss
                    if (pos >= (wpl * h) || (xn < 1 && xp < 1))
                        return;

                    currword = *(sword + pos);
                    destword = currword;

                    parbitsxp = xp & 31;
                    parbitsxn = xn & 31;
                    nwords = xp >> 5;

                    if (parbitsxp > 0)
                        nwords += 1;
                    else
                        parbitsxp = 31;

                    siter = (col - nwords);
                    eiter = (col + nwords);

                    //Get prev word
                    if (col == 0)
                        firstword = 0x0;
                    else
                        firstword = *(sword + pos - 1);

                    //Get next word
                    if (col == (wpl - 1))
                        secondword = 0x0;
                    else
                        secondword = *(sword + pos + 1);

                    //Last partial bits on either side
                    for (i = 1; i <= parbitsxp; i++) {
                        //Get the max value on LHS of every pixel
                        tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32 - i)) | ((currword >> i));

                        destword |= tempword;

                        //Get max value on RHS of every pixel
                        tempword = (currword << i) | (secondword >> (32 - i));
                        destword |= tempword;
                    }

                    //Return if halfwidth <= 1 word
                    if (nwords == 1) {
                        if (xn == 32) {
                            destword |= firstword;
                        }
                        if (xp == 32) {
                            destword |= secondword;
                        }

                        *(dword + pos) = destword;
                        return;
                    }

                    if (siter < 0)
                        firstword = 0x0;
                    else
                        firstword = *(sword + row * wpl + siter);

                    if (eiter >= wpl)
                        lastword = 0x0;
                    else
                        lastword = *(sword + row * wpl + eiter);

                    for (i = 1; i < nwords; i++) {
                        //Gets LHS words
                        if ((siter + i) < 0)
                            secondword = 0x0;
                        else
                            secondword = *(sword + row * wpl + siter + i);

                        lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;

                        firstword = secondword;

                        if ((siter + i + 1) < 0)
                            secondword = 0x0;
                        else
                            secondword = *(sword + row * wpl + siter + i + 1);

                        lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;

                        //Gets RHS words
                        if ((eiter - i) >= wpl)
                            firstword = 0x0;
                        else
                            firstword = *(sword + row * wpl + eiter - i);

                        rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);

                        lastword = firstword;
                        if ((eiter - i - 1) >= wpl)
                            firstword = 0x0;
                        else
                            firstword = *(sword + row * wpl + eiter - i - 1);

                        rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);

                        for (j = 1; j < 32; j++) {
                            //OR LHS full words
                            tempword = (lprevword << j) | (lnextword >> (32 - j));
                            destword |= tempword;

                            //OR RHS full words
                            tempword = (rprevword << j) | (rnextword >> (32 - j));
                            destword |= tempword;
                        }

                        destword |= lprevword;
                        destword |= lnextword;
                        destword |= rprevword;
                        destword |= rnextword;

                        lastword = firstword;
                        firstword = secondword;
                    }

                    *(dword + pos) = destword;
                }\n)

                    KERNEL(
\n__kernel void morphoDilateHor_32word(__global int *sword, __global int *dword, const int halfwidth, const int wpl, const int h, const char isEven) {
                        const int col = get_global_id(0);
                        const int row = get_global_id(1);
                        const unsigned int pos = row * wpl + col;
                        unsigned int prevword, nextword, currword, tempword;
                        unsigned int destword;
                        int i;

                        //Ignore the execss
                        if (pos >= (wpl * h))
                            return;

                        currword = *(sword + pos);
                        destword = currword;

                        //Handle boundary conditions
                        if (col == 0)
                            prevword = 0;
                        else
                            prevword = *(sword + pos - 1);

                        if (col == (wpl - 1))
                            nextword = 0;
                        else
                            nextword = *(sword + pos + 1);

                        for (i = 1; i <= halfwidth; i++) {
                            //Get the max value on LHS of every pixel
                            if (i == halfwidth && isEven) {
                                tempword = 0x0;
                            } else {
                                tempword = (prevword << (32 - i)) | ((currword >> i));
                            }

                            destword |= tempword;

                            //Get max value on RHS of every pixel
                            tempword = (currword << i) | (nextword >> (32 - i));

                            destword |= tempword;
                        }

                        *(dword + pos) = destword;
                    }\n)

                        KERNEL(
\n__kernel void morphoDilateVer(__global int *sword, __global int *dword, const int yp, const int wpl, const int h, const int yn) {
                            const int col = get_global_id(0);
                            const int row = get_global_id(1);
                            const unsigned int pos = row * wpl + col;
                            unsigned int tempword;
                            unsigned int destword;
                            int i, siter, eiter;

                            //Ignore the execss
                            if (row >= h || col >= wpl)
                                return;

                            destword = *(sword + pos);

                            //Set start position and end position considering the boundary conditions
                            siter = (row - yn) < 0 ? 0 : (row - yn);
                            eiter = (row >= (h - yp)) ? (h - 1) : (row + yp);

                            for (i = siter; i <= eiter; i++) {
                                tempword = *(sword + i * wpl + col);

                                destword |= tempword;
                            }

                            *(dword + pos) = destword;
                        }\n)

                            KERNEL(
\n__kernel void morphoErodeHor_5x5(__global int *sword, __global int *dword, const int wpl, const int h) {
                                const unsigned int pos = get_global_id(0);
                                unsigned int prevword, nextword, currword, tempword;
                                unsigned int destword;
                                const int col = pos % wpl;

                                //Ignore the execss
                                if (pos >= (wpl * h))
                                    return;

                                currword = *(sword + pos);
                                destword = currword;

                                //Handle boundary conditions
                                if (col == 0)
                                    prevword = 0xffffffff;
                                else
                                    prevword = *(sword + pos - 1);

                                if (col == (wpl - 1))
                                    nextword = 0xffffffff;
                                else
                                    nextword = *(sword + pos + 1);

                                //Loop unrolled

                                //1 bit to left and 1 bit to right
                                //Get the min value on LHS of every pixel
                                tempword = (prevword << (31)) | ((currword >> 1));
                                destword &= tempword;
                                //Get min value on RHS of every pixel
                                tempword = (currword << 1) | (nextword >> (31));
                                destword &= tempword;

                                //2 bit to left and 2 bit to right
                                //Get the min value on LHS of every pixel
                                tempword = (prevword << (30)) | ((currword >> 2));
                                destword &= tempword;
                                //Get min value on RHS of every pixel
                                tempword = (currword << 2) | (nextword >> (30));
                                destword &= tempword;

                                *(dword + pos) = destword;
                            }\n)

                                KERNEL(
\n__kernel void morphoErodeVer_5x5(__global int *sword, __global int *dword, const int wpl, const int h, const int fwmask, const int lwmask) {
                                    const int col = get_global_id(0);
                                    const int row = get_global_id(1);
                                    const unsigned int pos = row * wpl + col;
                                    unsigned int tempword;
                                    unsigned int destword;
                                    int i;

                                    //Ignore the execss
                                    if (row >= h || col >= wpl)
                                        return;

                                    destword = *(sword + pos);

                                    if (row < 2 || row >= (h - 2)) {
                                        destword = 0x0;
                                    } else {
                                        //2 words above
                                        //i = (row - 2) < 0 ? row : (row - 2);
                                        i = (row - 2);
                                        tempword = *(sword + i * wpl + col);
                                        destword &= tempword;

                                        //1 word above
                                        //i = (row - 1) < 0 ? row  : (row - 1);
                                        i = (row - 1);
                                        tempword = *(sword + i * wpl + col);
                                        destword &= tempword;

                                        //1 word below
                                        //i = (row >= (h - 1)) ? row : (row + 1);
                                        i = (row + 1);
                                        tempword = *(sword + i * wpl + col);
                                        destword &= tempword;

                                        //2 words below
                                        //i = (row >= (h - 2)) ? row : (row + 2);
                                        i = (row + 2);
                                        tempword = *(sword + i * wpl + col);
                                        destword &= tempword;

                                        if (col == 0) {
                                            destword &= fwmask;
                                        }
                                        if (col == (wpl - 1)) {
                                            destword &= lwmask;
                                        }
                                    }

                                    *(dword + pos) = destword;
                                }\n)

                                    KERNEL(
\n__kernel void morphoErodeHor(__global int *sword, __global int *dword, const int xp, const int xn, const int wpl, const int h, const char isAsymmetric, const int rwmask, const int lwmask) {
                                        const int col = get_global_id(0);
                                        const int row = get_global_id(1);
                                        const unsigned int pos = row * wpl + col;
                                        unsigned int parbitsxp, parbitsxn, nwords;
                                        unsigned int destword, tempword, lastword, currword;
                                        unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
                                        int i, j, siter, eiter;

                                        //Ignore the execss
                                        if (pos >= (wpl * h) || (xn < 1 && xp < 1))
                                            return;

                                        currword = *(sword + pos);
                                        destword = currword;

                                        parbitsxp = xp & 31;
                                        parbitsxn = xn & 31;
                                        nwords = xp >> 5;

                                        if (parbitsxp > 0)
                                            nwords += 1;
                                        else
                                            parbitsxp = 31;

                                        siter = (col - nwords);
                                        eiter = (col + nwords);

                                        //Get prev word
                                        if (col == 0)
                                            firstword = 0xffffffff;
                                        else
                                            firstword = *(sword + pos - 1);

                                        //Get next word
                                        if (col == (wpl - 1))
                                            secondword = 0xffffffff;
                                        else
                                            secondword = *(sword + pos + 1);

                                        //Last partial bits on either side
                                        for (i = 1; i <= parbitsxp; i++) {
                                            //Get the max value on LHS of every pixel
                                            tempword = (firstword << (32 - i)) | ((currword >> i));
                                            destword &= tempword;

                                            //Get max value on RHS of every pixel
                                            tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));

                                            //tempword = (currword << i) | (secondword >> (32 - i));
                                            destword &= tempword;
                                        }

                                        //Return if halfwidth <= 1 word
                                        if (nwords == 1) {
                                            if (xp == 32) {
                                                destword &= firstword;
                                            }
                                            if (xn == 32) {
                                                destword &= secondword;
                                            }

                                            //Clear boundary pixels
                                            if (isAsymmetric) {
                                                if (col == 0)
                                                    destword &= rwmask;
                                                if (col == (wpl - 1))
                                                    destword &= lwmask;
                                            }

                                            *(dword + pos) = destword;
                                            return;
                                        }

                                        if (siter < 0)
                                            firstword = 0xffffffff;
                                        else
                                            firstword = *(sword + row * wpl + siter);

                                        if (eiter >= wpl)
                                            lastword = 0xffffffff;
                                        else
                                            lastword = *(sword + row * wpl + eiter);

                                        for (i = 1; i < nwords; i++) {
                                            //Gets LHS words
                                            if ((siter + i) < 0)
                                                secondword = 0xffffffff;
                                            else
                                                secondword = *(sword + row * wpl + siter + i);

                                            lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

                                            firstword = secondword;

                                            if ((siter + i + 1) < 0)
                                                secondword = 0xffffffff;
                                            else
                                                secondword = *(sword + row * wpl + siter + i + 1);

                                            lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

                                            //Gets RHS words
                                            if ((eiter - i) >= wpl)
                                                firstword = 0xffffffff;
                                            else
                                                firstword = *(sword + row * wpl + eiter - i);

                                            rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

                                            lastword = firstword;
                                            if ((eiter - i - 1) >= wpl)
                                                firstword = 0xffffffff;
                                            else
                                                firstword = *(sword + row * wpl + eiter - i - 1);

                                            rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

                                            for (j = 0; j < 32; j++) {
                                                //OR LHS full words
                                                tempword = (lprevword << j) | (lnextword >> (32 - j));
                                                destword &= tempword;

                                                //OR RHS full words
                                                tempword = (rprevword << j) | (rnextword >> (32 - j));
                                                destword &= tempword;
                                            }

                                            destword &= lprevword;
                                            destword &= lnextword;
                                            destword &= rprevword;
                                            destword &= rnextword;

                                            lastword = firstword;
                                            firstword = secondword;
                                        }

                                        if (isAsymmetric) {
                                            //Clear boundary pixels
                                            if (col < (nwords - 1))
                                                destword = 0x0;
                                            else if (col == (nwords - 1))
                                                destword &= rwmask;
                                            else if (col > (wpl - nwords))
                                                destword = 0x0;
                                            else if (col == (wpl - nwords))
                                                destword &= lwmask;
                                        }

                                        *(dword + pos) = destword;
                                    }\n)

                                        KERNEL(
\n__kernel void morphoErodeHor_32word(__global int *sword, __global int *dword, const int halfwidth, const int wpl, const int h, const char clearBoundPixH, const int rwmask, const int lwmask, const char isEven) {
                                            const int col = get_global_id(0);
                                            const int row = get_global_id(1);
                                            const unsigned int pos = row * wpl + col;
                                            unsigned int prevword, nextword, currword, tempword, destword;
                                            int i;

                                            //Ignore the execss
                                            if (pos >= (wpl * h))
                                                return;

                                            currword = *(sword + pos);
                                            destword = currword;

                                            //Handle boundary conditions
                                            if (col == 0)
                                                prevword = 0xffffffff;
                                            else
                                                prevword = *(sword + pos - 1);

                                            if (col == (wpl - 1))
                                                nextword = 0xffffffff;
                                            else
                                                nextword = *(sword + pos + 1);

                                            for (i = 1; i <= halfwidth; i++) {
                                                //Get the min value on LHS of every pixel
                                                tempword = (prevword << (32 - i)) | ((currword >> i));

                                                destword &= tempword;

                                                //Get min value on RHS of every pixel
                                                if (i == halfwidth && isEven) {
                                                    tempword = 0xffffffff;
                                                } else {
                                                    tempword = (currword << i) | (nextword >> (32 - i));
                                                }

                                                destword &= tempword;
                                            }

                                            if (clearBoundPixH) {
                                                if (col == 0) {
                                                    destword &= rwmask;
                                                } else if (col == (wpl - 1)) {
                                                    destword &= lwmask;
                                                }
                                            }

                                            *(dword + pos) = destword;
                                        }\n)

                                            KERNEL(
\n__kernel void morphoErodeVer(__global int *sword, __global int *dword, const int yp, const int wpl, const int h, const char clearBoundPixV, const int yn) {
                                                const int col = get_global_id(0);
                                                const int row = get_global_id(1);
                                                const unsigned int pos = row * wpl + col;
                                                unsigned int tempword, destword;
                                                int i, siter, eiter;

                                                //Ignore the execss
                                                if (row >= h || col >= wpl)
                                                    return;

                                                destword = *(sword + pos);

                                                //Set start position and end position considering the boundary conditions
                                                siter = (row - yp) < 0 ? 0 : (row - yp);
                                                eiter = (row >= (h - yn)) ? (h - 1) : (row + yn);

                                                for (i = siter; i <= eiter; i++) {
                                                    tempword = *(sword + i * wpl + col);

                                                    destword &= tempword;
                                                }

                                                //Clear boundary pixels
                                                if (clearBoundPixV && ((row < yp) || ((h - row) <= yn))) {
                                                    destword = 0x0;
                                                }

                                                *(dword + pos) = destword;
                                            }\n)

    // HistogramRect Kernel: Accumulate
    // assumes 4 channels, i.e., bytes_per_pixel = 4
    // assumes number of pixels is multiple of 8
    // data is laid out as
    // ch0                                           ch1 ...
    // bin0          bin1            bin2...         bin0...
    // rpt0,1,2...256  rpt0,1,2...
    KERNEL(
\n #define HIST_REDUNDANCY 256\n
\n #define GROUP_SIZE 256\n
\n #define HIST_SIZE 256\n
\n #define NUM_CHANNELS 4\n
\n #define HR_UNROLL_SIZE 8 \n
\n #define HR_UNROLL_TYPE uchar8 \n

        __attribute__((reqd_work_group_size(256, 1, 1))) __kernel void kernel_HistogramRectAllChannels(__global const uchar8 *data, uint numPixels, __global uint *histBuffer) {
            // declare variables
            uchar8 pixels;
            int threadOffset = get_global_id(0) % HIST_REDUNDANCY;

            // for each pixel/channel, accumulate in global memory
            for (uint pc = get_global_id(0); pc < numPixels * NUM_CHANNELS / HR_UNROLL_SIZE; pc += get_global_size(0)) {
                pixels = data[pc];
                //                       channel                        bin                         thread
                atomic_inc(&histBuffer[0 * HIST_SIZE * HIST_REDUNDANCY + pixels.s0 * HIST_REDUNDANCY + threadOffset]);  // ch0
                atomic_inc(&histBuffer[0 * HIST_SIZE * HIST_REDUNDANCY + pixels.s4 * HIST_REDUNDANCY + threadOffset]);  // ch0
                atomic_inc(&histBuffer[1 * HIST_SIZE * HIST_REDUNDANCY + pixels.s1 * HIST_REDUNDANCY + threadOffset]);  // ch1
                atomic_inc(&histBuffer[1 * HIST_SIZE * HIST_REDUNDANCY + pixels.s5 * HIST_REDUNDANCY + threadOffset]);  // ch1
                atomic_inc(&histBuffer[2 * HIST_SIZE * HIST_REDUNDANCY + pixels.s2 * HIST_REDUNDANCY + threadOffset]);  // ch2
                atomic_inc(&histBuffer[2 * HIST_SIZE * HIST_REDUNDANCY + pixels.s6 * HIST_REDUNDANCY + threadOffset]);  // ch2
                atomic_inc(&histBuffer[3 * HIST_SIZE * HIST_REDUNDANCY + pixels.s3 * HIST_REDUNDANCY + threadOffset]);  // ch3
                atomic_inc(&histBuffer[3 * HIST_SIZE * HIST_REDUNDANCY + pixels.s7 * HIST_REDUNDANCY + threadOffset]);  // ch3
            }
        })

        KERNEL(
            // NUM_CHANNELS = 1
            __attribute__((reqd_work_group_size(256, 1, 1))) __kernel void kernel_HistogramRectOneChannel(__global const uchar8 *data, uint numPixels, __global uint *histBuffer) {
                // declare variables
                uchar8 pixels;
                int threadOffset = get_global_id(0) % HIST_REDUNDANCY;

                // for each pixel/channel, accumulate in global memory
                for (uint pc = get_global_id(0); pc < numPixels / HR_UNROLL_SIZE; pc += get_global_size(0)) {
                    pixels = data[pc];
                    //                        bin                         thread
                    atomic_inc(&histBuffer[pixels.s0 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s1 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s2 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s3 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s4 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s5 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s6 * HIST_REDUNDANCY + threadOffset]);
                    atomic_inc(&histBuffer[pixels.s7 * HIST_REDUNDANCY + threadOffset]);
                }
            })

    // HistogramRect Kernel: Reduction
    // only supports 4 channels
    // each work group handles a single channel of a single histogram bin
    KERNEL(__attribute__((reqd_work_group_size(256, 1, 1))) __kernel void kernel_HistogramRectAllChannelsReduction(int n,  // unused pixel redundancy
                                                                                                                   __global uint *histBuffer, __global int *histResult) {
        // declare variables
        int channel = get_group_id(0) / HIST_SIZE;
        int bin = get_group_id(0) % HIST_SIZE;
        int value = 0;

        // accumulate in register
        for (uint i = get_local_id(0); i < HIST_REDUNDANCY; i += GROUP_SIZE) {
            value += histBuffer[channel * HIST_SIZE * HIST_REDUNDANCY + bin * HIST_REDUNDANCY + i];
        }

        // reduction in local memory
        __local int localHist[GROUP_SIZE];
        localHist[get_local_id(0)] = value;
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int stride = GROUP_SIZE / 2; stride >= 1; stride /= 2) {
            if (get_local_id(0) < stride) {
                value = localHist[get_local_id(0) + stride];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < stride) {
                localHist[get_local_id(0)] += value;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // write reduction to final result
        if (get_local_id(0) == 0) {
            histResult[get_group_id(0)] = localHist[0];
        }
    }  // kernel_HistogramRectAllChannels
           )

        KERNEL(
            // NUM_CHANNELS = 1
            __attribute__((reqd_work_group_size(256, 1, 1))) __kernel void kernel_HistogramRectOneChannelReduction(int n,  // unused pixel redundancy
                                                                                                                   __global uint *histBuffer, __global int *histResult) {
                // declare variables
                // int channel = get_group_id(0)/HIST_SIZE;
                int bin = get_group_id(0) % HIST_SIZE;
                int value = 0;

                // accumulate in register
                for (int i = get_local_id(0); i < HIST_REDUNDANCY; i += GROUP_SIZE) {
                    value += histBuffer[bin * HIST_REDUNDANCY + i];
                }

                // reduction in local memory
                __local int localHist[GROUP_SIZE];
                localHist[get_local_id(0)] = value;
                barrier(CLK_LOCAL_MEM_FENCE);
                for (int stride = GROUP_SIZE / 2; stride >= 1; stride /= 2) {
                    if (get_local_id(0) < stride) {
                        value = localHist[get_local_id(0) + stride];
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
                    if (get_local_id(0) < stride) {
                        localHist[get_local_id(0)] += value;
                    }
                    barrier(CLK_LOCAL_MEM_FENCE);
                }

                // write reduction to final result
                if (get_local_id(0) == 0) {
                    histResult[get_group_id(0)] = localHist[0];
                }
            }  // kernel_HistogramRectOneChannelReduction
            )

    // ThresholdRectToPix Kernel
    // only supports 4 channels
    // imageData is input image (24-bits/pixel)
    // pix is output image (1-bit/pixel)
    KERNEL(
\n #define CHAR_VEC_WIDTH 4 \n
\n #define PIXELS_PER_WORD 32 \n
\n #define PIXELS_PER_BURST 8 \n
\n #define BURSTS_PER_WORD(PIXELS_PER_WORD / PIXELS_PER_BURST) \n typedef union {
        uchar s[PIXELS_PER_BURST * NUM_CHANNELS];
        uchar4 v[(PIXELS_PER_BURST * NUM_CHANNELS) / CHAR_VEC_WIDTH];
    } charVec;

        __attribute__((reqd_work_group_size(256, 1, 1))) __kernel void kernel_ThresholdRectToPix(__global const uchar4 *imageData, int height, int width,
                                                                                                 int wpl,  // words per line
                                                                                                 __global int *thresholds, __global int *hi_values, __global int *pix) {
            // declare variables
            int pThresholds[NUM_CHANNELS];
            int pHi_Values[NUM_CHANNELS];
            for (int i = 0; i < NUM_CHANNELS; i++) {
                pThresholds[i] = thresholds[i];
                pHi_Values[i] = hi_values[i];
            }

            // for each word (32 pixels) in output image
            for (uint w = get_global_id(0); w < wpl * height; w += get_global_size(0)) {
                unsigned int word = 0;  // all bits start at zero
                // for each burst in word
                for (int b = 0; b < BURSTS_PER_WORD; b++) {
                    // load burst
                    charVec pixels;
                    int offset = (w / wpl) * width;
                    offset += (w % wpl) * PIXELS_PER_WORD;
                    offset += b * PIXELS_PER_BURST;

                    for (int i = 0; i < PIXELS_PER_BURST; ++i)
                        pixels.v[i] = imageData[offset + i];

                    // for each pixel in burst
                    for (int p = 0; p < PIXELS_PER_BURST; p++) {
                        for (int c = 0; c < NUM_CHANNELS; c++) {
                            unsigned char pixChan = pixels.s[p * NUM_CHANNELS + c];
                            if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
                                const uint kTopBit = 0x80000000;
                                word |= (kTopBit >> ((b * PIXELS_PER_BURST + p) & 31));
                            }
                        }
                    }
                }
                pix[w] = word;
            }
        }

\n #define CHAR_VEC_WIDTH 8 \n
\n #define PIXELS_PER_WORD 32 \n
\n #define PIXELS_PER_BURST 8 \n
\n #define BURSTS_PER_WORD(PIXELS_PER_WORD / PIXELS_PER_BURST) \n typedef union {
            uchar s[PIXELS_PER_BURST * 1];
            uchar8 v[(PIXELS_PER_BURST * 1) / CHAR_VEC_WIDTH];
        } charVec1;

        __attribute__((reqd_work_group_size(256, 1, 1))) __kernel void kernel_ThresholdRectToPix_OneChan(__global const uchar8 *imageData, int height, int width,
                                                                                                         int wpl,  // words per line of output image
                                                                                                         __global int *thresholds, __global int *hi_values, __global int *pix) {
            // declare variables
            int pThresholds[1];
            int pHi_Values[1];
            for (int i = 0; i < 1; i++) {
                pThresholds[i] = thresholds[i];
                pHi_Values[i] = hi_values[i];
            }

            // for each word (32 pixels) in output image
            for (uint w = get_global_id(0); w < wpl * height; w += get_global_size(0)) {
                unsigned int word = 0;  // all bits start at zero

                // for each burst in word
                for (int b = 0; b < BURSTS_PER_WORD; b++) {
                    // load burst
                    charVec1 pixels;
                    // for each char8 in burst
                    pixels.v[0] = imageData[w * BURSTS_PER_WORD + b + 0];

                    // for each pixel in burst
                    for (int p = 0; p < PIXELS_PER_BURST; p++) {
                        //int littleEndianIdx = p ^ 3;
                        //int bigEndianIdx = p;
                        int idx =
\n #ifdef __ENDIAN_LITTLE__\n
                            p ^
                            3;
                        \n #else \n
                            p;
                        \n #endif \n unsigned char pixChan = pixels.s[idx];
                        if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) {
                            const uint kTopBit = 0x80000000;
                            word |= (kTopBit >> ((b * PIXELS_PER_BURST + p) & 31));
                        }
                    }
                }
                pix[w] = word;
            }
        })

    ;  // close char*

#  endif // USE_EXTERNAL_KERNEL
#endif   // TESSERACT_OPENCL_OCLKERNELS_H_
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
